library(tidyverse)
library(stringi)
library(quanteda)
library(igraph)
library(Matrix)

## Corpus language to analyse: "ko" or "ja".
language <- "ko"

## Tweet-level data. The ID columns are parsed as character so the long
## numeric identifiers are kept verbatim.
## NOTE(review): `in_data_dir` is not defined in this script; it has to exist
## in the calling environment before this file is run.
data_tweet <- read_csv(
  paste0(in_data_dir, "data_tweet.csv.tar.gz"),
  col_types = cols(
    id_str = col_character(),
    user_id_str = col_character()
  )
)

## Retweet records for the selected language; all four ID columns are parsed
## as character as well.
data_retweet_user_ids <- read_csv(
  paste0(in_data_dir, "data_retweet_user_ids_", language, ".csv.tar.gz"),
  col_types = cols(
    retweeted_status_id_str = col_character(),
    id_str = col_character(),
    user_id_str = col_character(),
    retweeted_status_user_id_str = col_character()
  )
)

## Generate the directed retweet network: one edge per retweet record, pointing
## from the retweeting account to the account that was retweeted.
## The edge source is `user_id_str` (the retweeter) rather than `id_str`
## (presumably the ID of the retweet itself -- verify against the data
## dictionary), so that every vertex is an account and the `user_id_str`
## column of `data_centrality` only ever holds account IDs. The in-degree of
## a retweeted account is the same either way: it is the number of retweet
## records that point at it.
graph_retweet_network <-
  data_retweet_user_ids %>%
  select(from = user_id_str, to = retweeted_status_user_id_str) %>%
  graph_from_data_frame()

## Calculate degree centrality (in-degree, i.e. how often an account was
## retweeted) and store it as one row per vertex.
degree_centrality <- degree(graph_retweet_network, mode = "in")
data_centrality <-
  tibble(user_id_str = names(degree_centrality),
         deg_centrality = degree_centrality)

## Generate the retweet correspondence object in quanteda (fcm).
## Step 1: keep only retweet records of accounts that retweeted more than
## once, then build one row per retweeted account whose text is the
## space-separated list of the accounts that retweeted it.
data_retweet_user_ids_collapsed <- data_retweet_user_ids %>%
  group_by(user_id_str) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  group_by(retweeted_status_user_id_str) %>%
  summarise(
    count = n(),
    collapsed_user_id_str = paste(user_id_str, collapse = " ")
  )

quanteda_options(threads = 8)

## Step 2: treat each retweeted account as a document and each retweeter ID as
## a token; drop retweeter IDs that occur in fewer than two documents.
dfm_retweet_ids <-
  corpus(
    data_retweet_user_ids_collapsed,
    docid_field = "retweeted_status_user_id_str",
    text_field = "collapsed_user_id_str"
  ) %>%
  tokens() %>%
  dfm() %>%
  dfm_trim(min_docfreq = 2)

## Step 3: transpose so that the retweeted accounts become the features, then
## count (boolean, per document) how often two of them co-occur. The full
## symmetric matrix is kept (tri = FALSE).
fcm_retweet_ids <- fcm(t(dfm_retweet_ids), count = "boolean", tri = FALSE)

## Convert the retweet correspondence object to an undirected, weighted network
## graph (self-loops ignored) and drop vertices that have no edge at all.
graph_retweet_correspondence <- fcm_retweet_ids %>%
  as("sparseMatrix") %>%
  graph_from_adjacency_matrix(mode = "undirected", diag = FALSE,
                              weighted = TRUE) %>%
  ## optional stricter edge filter, currently disabled:
  ## delete_edges(which(E(.)$weight <= 10)) %>%
  delete_vertices(., degree(.) == 0)

## Estimate communities (Louvain).
## The seed is fixed first because the community numbers are hard-coded when
## the clusters are chosen further down in this script.
## NOTE(review): recent igraph releases appear to process vertices in random
## order in cluster_louvain(); confirm for the installed version. With such a
## version the hard-coded cluster numbers may need to be re-checked even with
## a fixed seed. The seed is set again before the figure layout is computed,
## so this call does not affect the layout.
if (language == "ja") set.seed(06092020) else set.seed(06162020)
cluster_retweet_correspondence_louvain <-
  cluster_louvain(graph_retweet_correspondence)

## Combine centrality and community membership per account. stack() yields the
## columns `values` and `ind`, renamed here to the account ID and the community
## number; merge() then keeps only accounts present in both tables.
data_user_retweet_network <-
  merge(
    data_centrality,
    rename(stack(cluster_retweet_correspondence_louvain),
           cluster_louvain = ind, user_id_str = values)
  )

## Choose the clusters to use.
## `cluster_louvain` holds the hard-coded community numbers taken from the
## Louvain result, `cluster_title` / `cluster_title_short` their long and short
## labels. `threshold` is the minimum co-retweet weight an edge needs to be
## kept in the network figure.
if (language == "ja") {
  cluster_names <- tibble(
    cluster_louvain = c(25, 24, 6, 5, 11),
    cluster_title = c(
      "Xenophobic, right-wing extremist",
      "News media, news aggregator",
      "Right-wing News commentary and curation",
      "Left-wing and anti-right-wing",
      "Not political"
    ),
    cluster_title_short = c("Rightist", "Media", "Rightist (News)",
                            "Leftist", "Non Political")
  )
  threshold <- 9
} else if (language == "ko") {
  cluster_names <- tibble(
    cluster_louvain = c(4, 5, 6, 2),
    cluster_title = c("Leftist", "Official", "Media", "Rightist"),
    cluster_title_short = cluster_title
  )
  threshold <- 10
} else {
  ## Fail here instead of with an "object not found" error further down.
  stop("Unsupported language: \"", language,
       "\" (expected \"ja\" or \"ko\")", call. = FALSE)
}

## Attach the cluster labels once. merge() joins on the shared column
## `cluster_louvain` and keeps only accounts that belong to a selected cluster.
data_user_selected_clusters <- merge(data_user_retweet_network, cluster_names)

## Generate cluster membership: one row per account, ordered by cluster and,
## within a cluster, by decreasing degree centrality.
## NOTE(review): `out_data_dir` is not defined in this script; it has to exist
## in the calling environment.
data_user_selected_clusters %>%
  select(user_id_str, cluster_louvain, cluster_title_short, deg_centrality) %>%
  arrange(cluster_louvain, -deg_centrality) %>%
  write_csv(paste0(out_data_dir, "data_account_", language, ".csv"))

## Generate table 2: number of accounts per cluster, largest first.
## Printed explicitly so the table also appears when the script is run through
## source(), which does not auto-print top-level values.
table_2 <- data_user_selected_clusters %>%
  group_by(cluster_title) %>%
  summarize(count = n()) %>%
  arrange(-count)
print(table_2)

## Generate the data for the network figure (saved as Figure 1 for "ja" and
## Figure 2 for "ko").
## Co-retweet weights below the language-specific threshold are set to zero so
## that only strong ties become edges; vertices left without an edge are
## dropped.
tmp_mat <- as(fcm_retweet_ids, "matrix")
tmp_mat[tmp_mat < threshold] <- 0
graph_retweet_correspondence_2 <- tmp_mat %>%
  graph_from_adjacency_matrix(mode = "undirected", diag = FALSE,
                              weighted = TRUE) %>%
  delete_vertices(., degree(.) == 0)

## Fix the seed immediately before the layout is computed.
if (language == "ja") set.seed(06092020) else set.seed(06162020)

## Vertex table: standardised layout coordinates plus, where available, the
## centrality and community of each account.
data_network_plot <-
  layout_with_lgl(graph_retweet_correspondence_2) %>%
  data.frame() %>%
  rename(x = 1, y = 2) %>%
  mutate_at(1:2, scale) %>%
  mutate(user_id_str = names(V(graph_retweet_correspondence_2))) %>%
  left_join(data_user_retweet_network, by = "user_id_str")

## Edge table: coordinates and community of both end points, looked up by the
## numeric vertex index (names = FALSE). Only edges whose two end points both
## belong to a selected cluster are kept.
edgelist <- as_edgelist(graph_retweet_correspondence_2, names = FALSE)
data_edges <-
  data.frame(
    data_network_plot[edgelist[, 1], c("x", "y", "cluster_louvain")],
    data_network_plot[edgelist[, 2], c("x", "y", "cluster_louvain")]
  ) %>%
  rename(x1 = 1, y1 = 2, cluster_louvain1 = 3,
         x2 = 4, y2 = 5, cluster_louvain2 = 6) %>%
  filter(cluster_louvain1 %in% cluster_names$cluster_louvain,
         cluster_louvain2 %in% cluster_names$cluster_louvain) %>%
  ## NOTE(review): `screen_name` is not referenced by the plotting code in this
  ## script; kept in case something else relies on the column.
  mutate(screen_name = NA)

## Draw the network: thin grey edges first, then one point per account in a
## selected cluster, coloured by the short cluster label. Axes, grid and
## border are removed because the layout coordinates carry no meaning.
p <- data_network_plot %>%
  filter(cluster_louvain %in% cluster_names$cluster_louvain) %>%
  merge(cluster_names) %>%
  ggplot(aes(x = x, y = y, color = cluster_title_short)) +
  geom_segment(
    data = data_edges,
    mapping = aes(x = x1, y = y1, xend = x2, yend = y2),
    size = 0.25, color = "grey", alpha = 1 / 3
  ) +
  geom_point() +
  theme_minimal() +
  scale_color_brewer("Community", palette = "Set1") +
  theme(
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Save a PNG named after the language, plus the numbered PDF for the paper
## ("ja" -> figure1.pdf, "ko" -> figure2.pdf; no PDF for any other value).
## NOTE(review): `fig_dir` is not defined in this script; it has to exist in
## the calling environment.
ggsave(paste0("community_visual_", language, ".png"),
       plot = p, width = 8, height = 5, path = fig_dir)
figure_pdf <- switch(language, ja = "figure1.pdf", ko = "figure2.pdf")
if (!is.null(figure_pdf)) {
  ggsave(figure_pdf, plot = p, width = 8, height = 5, path = fig_dir)
}


